In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, DBSCAN
# Widen pandas display so previews of the Titanic frame are readable.
for option, value in {'display.max_columns': 100,
                      'display.max_rows': 10,
                      'display.width': 100}.items():
    pd.set_option(option, value)
print("Libraries imported successfully.")
# Enumerate every data file shipped under input/.
for root, _, files in os.walk('input'):
    for name in files:
        print(os.path.join(root, name))
Libraries imported successfully. input/test.csv input/train.csv input/gender_submission.csv
In [2]:
# Data processing: load the Titanic training set and preview the first rows
df = pd.read_csv("input/train.csv")
display(df.head(10))
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| 5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
| 6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
| 7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
| 8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
| 9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
In [3]:
# Generate an automated EDA report (column stats, missing values, correlations).
# NOTE(review): third-party import inside a cell — ydata_profiling must be installed.
import ydata_profiling
report = ydata_profiling.ProfileReport(df)
display(report)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
In [4]:
# Reload the raw training data and keep only the numeric columns used for clustering.
data = pd.read_csv("input/train.csv",sep=",")
cluster_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
data1 = data.loc[:, cluster_columns]
print(data1.head())
data1.info()
Survived Pclass Age SibSp Parch Fare 0 0 3 22.0 1 0 7.2500 1 1 1 38.0 1 0 71.2833 2 1 3 26.0 0 0 7.9250 3 1 1 35.0 1 0 53.1000 4 0 3 35.0 0 0 8.0500 <class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null int64 1 Pclass 891 non-null int64 2 Age 714 non-null float64 3 SibSp 891 non-null int64 4 Parch 891 non-null int64 5 Fare 891 non-null float64 dtypes: float64(2), int64(4) memory usage: 41.9 KB
In [5]:
data2=data1.dropna(axis=0).reset_index(drop=True)
In [6]:
# K-means clustering
# Rebuild data2 from data1 so re-running this cell does not fail on a
# duplicate 'kmeans' column inserted by a previous run.
del data2
data2 = data1.dropna(axis=0).reset_index(drop=True)

def doKmeans(X, nclust=2, random_state=42):
    """Fit K-means on X with `nclust` clusters.

    Parameters:
        X: feature matrix (DataFrame or array).
        nclust: number of clusters.
        random_state: seed for centroid initialization — fixed so the
            cluster assignments are reproducible across runs.

    Returns:
        (labels, centers): per-row cluster labels and the cluster centroids.
    """
    # n_init=10 pinned explicitly: the sklearn default changed in 1.4,
    # so relying on it made results version-dependent.
    model = KMeans(n_clusters=nclust, n_init=10, random_state=random_state)
    model.fit(X)
    clust_labels = model.predict(X)
    cent = model.cluster_centers_
    return (clust_labels, cent)

clust_labels, cent = doKmeans(data2, 5)
kmeans = pd.DataFrame(clust_labels)
data2.insert((data2.shape[1]), 'kmeans', kmeans)
# Per-cluster means of every feature.
data2.groupby('kmeans').mean()
Out[6]:
| Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|
| kmeans | ||||||
| 0 | 0.283224 | 2.544662 | 32.125272 | 0.224401 | 0.217865 | 13.108678 |
| 1 | 0.764706 | 1.000000 | 31.235294 | 0.941176 | 1.352941 | 231.153676 |
| 2 | 0.686047 | 1.069767 | 35.906047 | 0.627907 | 0.488372 | 94.726500 |
| 3 | 1.000000 | 1.000000 | 35.333333 | 0.000000 | 0.333333 | 512.329200 |
| 4 | 0.570470 | 2.127517 | 18.354027 | 1.295302 | 0.953020 | 34.509536 |
In [7]:
# Visualize the K-means assignments in the Age/Fare plane.
fig, ax = plt.subplots()
scatter = ax.scatter(data2['Age'], data2['Fare'], c=kmeans[0], s=50)
ax.set(title='K-Means Clustering', xlabel='Age', ylabel='Fare')
plt.colorbar(scatter)
Out[7]:
<matplotlib.colorbar.Colorbar at 0x7467f304f1d0>
In [8]:
# Hierarchical clustering
# Rebuild data2 so the 'kmeans' label column added above is not fed to linkage.
del data2
data2 = data1.dropna(axis=0).reset_index(drop=True)
# Ward linkage on the first ten passengers only, to keep the dendrogram legible.
merg = linkage(data2.iloc[:10], method="ward")
print(merg)
[[ 2. 7. 3.9106507 2. ] [ 6. 9. 5.30477379 2. ] [ 0. 10. 6.15841156 3. ] [ 4. 12. 12.34003146 4. ] [ 8. 11. 18.2324326 3. ] [ 1. 3. 18.42911823 2. ] [ 5. 15. 23.5213758 3. ] [ 13. 14. 46.58343356 7. ] [ 16. 17. 102.82744677 10. ]]
In [9]:
# Draw the dendrogram for the ward linkage computed above.
fig, ax = plt.subplots()
dendrogram(merg, leaf_rotation=0)
ax.set(title='Hierarchical Clustering',
       xlabel='data points',
       ylabel='euclidean distance')
Out[9]:
Text(0, 0.5, 'euclidean distance')
In [10]:
data2[0:10]
Out[10]:
| Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 |
| 5 | 0 | 1 | 54.0 | 0 | 0 | 51.8625 |
| 6 | 0 | 3 | 2.0 | 3 | 1 | 21.0750 |
| 7 | 1 | 3 | 27.0 | 0 | 2 | 11.1333 |
| 8 | 1 | 2 | 14.0 | 1 | 0 | 30.0708 |
| 9 | 1 | 3 | 4.0 | 1 | 1 | 16.7000 |
In [11]:
# Agglomerative clustering
def doAgglomerative(X, nclust=2):
    """Ward-linkage agglomerative clustering of X into `nclust` clusters.

    Returns the per-row cluster labels.
    """
    clusterer = AgglomerativeClustering(n_clusters=nclust,
                                        metric='euclidean',
                                        linkage='ward')
    return clusterer.fit_predict(X)

clust_labels1 = doAgglomerative(data2, 5)
agglomerative = pd.DataFrame(clust_labels1)
data2.insert(data2.shape[1], 'agglomerative', agglomerative)
In [12]:
data2.groupby('agglomerative').mean()
Out[12]:
| Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|
| agglomerative | ||||||
| 0 | 0.334507 | 2.514085 | 28.484595 | 0.448944 | 0.394366 | 15.877105 |
| 1 | 0.733333 | 1.000000 | 32.430667 | 0.600000 | 0.866667 | 131.183883 |
| 2 | 0.653061 | 1.234694 | 35.632653 | 0.795918 | 0.377551 | 68.176576 |
| 3 | 1.000000 | 1.000000 | 35.333333 | 0.000000 | 0.333333 | 512.329200 |
| 4 | 0.733333 | 1.000000 | 30.333333 | 1.000000 | 1.333333 | 239.991940 |
In [13]:
%matplotlib inline
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(data2['Age'],data2['Fare'],c=agglomerative[0],s=50)
ax.set_title('Agglomerative Clustering')
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
plt.colorbar(scatter)
Out[13]:
<matplotlib.colorbar.Colorbar at 0x7467f2a83290>
In [14]:
plt.show()
In [15]:
# DBSCAN clustering on a 2-D PCA projection of the preprocessed features.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric columns: mean-impute missing values, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
# Categorical columns: mode-impute, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
X = preprocessor.fit_transform(data[features])

# Project to two principal components for clustering and plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

eps = 0.5          # neighborhood radius
min_samples = 5    # minimum neighbors to form a dense core point
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
clusters = dbscan.fit_predict(X_pca)

# Label -1 marks noise points; all other colors are dense clusters.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('DBSCAN Clustering for Titanic Data')
plt.show()
In [ ]:
In [ ]: